library(car)
## Loading required package: carData
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(readxl)
library(MLmetrics)
##
## Attaching package: 'MLmetrics'
## The following objects are masked from 'package:caret':
##
## MAE, RMSE
## The following object is masked from 'package:base':
##
## Recall
#load the dataset
mydata <- read.csv('./data/day.csv')
#convert 'dteday' column to Date format
mydata$dteday <- as.Date(mydata$dteday)
#season
mydata$season <- cut(mydata$season,
breaks = c(0.5, 1.5, 2.5, 3.5, 4.5),
labels = c("Winter", "Spring", "Summer", "Fall"))
mydata$season <- factor(mydata$season, levels = c("Winter", "Spring", "Summer", "Fall"))
#workingday
mydata$workingday <- ifelse(mydata$workingday == 0, "Not_Workingday", "Workingday")
mydata$workingday <- factor(mydata$workingday, levels = c("Not_Workingday", "Workingday"))
#weather
mydata$weathersit <- cut(mydata$weathersit,
breaks = c(0.5, 1.5, 2.5, 3.5, 4.5),
labels = c("Weather_1", "Weather_2", "Weather_3", "Weather_4"))
mydata$weathersit <- factor(mydata$weathersit, levels = c("Weather_1", "Weather_2", "Weather_3", "Weather_4"))
head(mydata)
## instant dteday season yr mnth holiday weekday workingday weathersit
## 1 1 2011-01-01 Winter 0 1 0 6 Not_Workingday Weather_2
## 2 2 2011-01-02 Winter 0 1 0 0 Not_Workingday Weather_2
## 3 3 2011-01-03 Winter 0 1 0 1 Workingday Weather_1
## 4 4 2011-01-04 Winter 0 1 0 2 Workingday Weather_1
## 5 5 2011-01-05 Winter 0 1 0 3 Workingday Weather_1
## 6 6 2011-01-06 Winter 0 1 0 4 Workingday Weather_1
## temp atemp hum windspeed casual registered cnt
## 1 0.344167 0.363625 0.805833 0.1604460 331 654 985
## 2 0.363478 0.353739 0.696087 0.2485390 131 670 801
## 3 0.196364 0.189405 0.437273 0.2483090 120 1229 1349
## 4 0.200000 0.212122 0.590435 0.1602960 108 1454 1562
## 5 0.226957 0.229270 0.436957 0.1869000 82 1518 1600
## 6 0.204348 0.233209 0.518261 0.0895652 88 1518 1606
We are splitting the dataset in 2 parts, first part is year 2011 and the second part is year 2012. Then we are splitting subset of year 2011 to be 50% training dataset and 50% validation dataset. The year 2012 dataset will be used for prediction.
# Set a seed for reproducibility
# Setting a seed is crucial for reproducibility and consistency of results.
# It ensures that when you run the same code multiple times, you'll ...
# ... obtain the same results.
set.seed(20231103)
# Filter dataset for year 2011 (0) and year 2012 (1)
year2011 <- mydata[mydata$yr == 0, ]
year2012 <- mydata[mydata$yr == 1, ]
# Add 'Type' column and assign values
year2011$Type <- "Training"
year2012$Type <- "Validation"
# Combine training and validation data
mydata <- rbind(year2011, year2012)
We choose 3 questions of interest:
#registered
ggplot(mydata, aes(x = Type, y = registered, color = Type)) +
geom_boxplot(position = position_dodge(width = 0.8)) +
labs(x = "Type", y = "# Registered Users") +
scale_fill_manual(name = "Dataset Type", values = c("Training" = "blue", "Validation" = "red")) +
ggtitle("# Registered Users by Type (Training vs. Validation)") +
theme_bw()
#sqrt(casual)
ggplot(mydata, aes(x = Type, y = sqrt(casual), color = Type)) +
geom_boxplot(position = position_dodge(width = 0.8)) +
labs(x = "Type", y = "Square Root # Casual Users") +
scale_fill_manual(name = "Dataset Type", values = c("Training" = "blue", "Validation" = "red")) +
ggtitle("Square Root # Casual Users by Type (Training vs. Validation)") +
theme_bw()
#cnt
ggplot(mydata, aes(x = Type, y = cnt, color = Type)) +
geom_boxplot(position = position_dodge(width = 0.8)) +
labs(x = "Type", y = "# Total Users") +
scale_fill_manual(name = "Dataset Type", values = c("Training" = "blue", "Validation" = "red")) +
ggtitle("# Total Users by Type (Training vs. Validation)") +
theme_bw()
# Function to calculate statistics
calculate_statistics <- function(x) {
return(c(
Min = min(x, na.rm = TRUE),
Q1 = quantile(x, 0.25, na.rm = TRUE),
Median = median(x, na.rm = TRUE),
Mean = mean(x, na.rm = TRUE),
Q3 = quantile(x, 0.75, na.rm = TRUE),
Max = max(x, na.rm = TRUE)
))
}
#creating training data our of mydata (training -> data from 2011)
training_data <- subset(mydata, Type == "Training")
summary(training_data)
## instant dteday season yr mnth
## Min. : 1 Min. :2011-01-01 Winter:90 Min. :0 Min. : 1.000
## 1st Qu.: 92 1st Qu.:2011-04-02 Spring:92 1st Qu.:0 1st Qu.: 4.000
## Median :183 Median :2011-07-02 Summer:94 Median :0 Median : 7.000
## Mean :183 Mean :2011-07-02 Fall :89 Mean :0 Mean : 6.526
## 3rd Qu.:274 3rd Qu.:2011-10-01 3rd Qu.:0 3rd Qu.:10.000
## Max. :365 Max. :2011-12-31 Max. :0 Max. :12.000
## holiday weekday workingday weathersit
## Min. :0.0000 Min. :0.000 Not_Workingday:115 Weather_1:226
## 1st Qu.:0.0000 1st Qu.:1.000 Workingday :250 Weather_2:124
## Median :0.0000 Median :3.000 Weather_3: 15
## Mean :0.0274 Mean :3.008 Weather_4: 0
## 3rd Qu.:0.0000 3rd Qu.:5.000
## Max. :1.0000 Max. :6.000
## temp atemp hum windspeed
## Min. :0.05913 Min. :0.07907 Min. :0.0000 Min. :0.02239
## 1st Qu.:0.32500 1st Qu.:0.32195 1st Qu.:0.5383 1st Qu.:0.13558
## Median :0.47917 Median :0.47285 Median :0.6475 Median :0.18690
## Mean :0.48666 Mean :0.46684 Mean :0.6437 Mean :0.19140
## 3rd Qu.:0.65667 3rd Qu.:0.61238 3rd Qu.:0.7421 3rd Qu.:0.23508
## Max. :0.84917 Max. :0.84090 Max. :0.9725 Max. :0.50746
## casual registered cnt Type
## Min. : 9.0 Min. : 416 Min. : 431 Length:365
## 1st Qu.: 222.0 1st Qu.:1730 1st Qu.:2132 Class :character
## Median : 614.0 Median :2915 Median :3740 Mode :character
## Mean : 677.4 Mean :2728 Mean :3406
## 3rd Qu.: 871.0 3rd Qu.:3632 3rd Qu.:4586
## Max. :3065.0 Max. :4614 Max. :6043
#creating training data our of mydata (validatioin -> data from 2012)
validation_data <- subset(mydata, Type == "Validation")
summary(validation_data)
## instant dteday season yr mnth
## Min. :366.0 Min. :2012-01-01 Winter:91 Min. :1 Min. : 1.000
## 1st Qu.:457.2 1st Qu.:2012-04-01 Spring:92 1st Qu.:1 1st Qu.: 4.000
## Median :548.5 Median :2012-07-01 Summer:94 Median :1 Median : 7.000
## Mean :548.5 Mean :2012-07-01 Fall :89 Mean :1 Mean : 6.514
## 3rd Qu.:639.8 3rd Qu.:2012-09-30 3rd Qu.:1 3rd Qu.: 9.750
## Max. :731.0 Max. :2012-12-31 Max. :1 Max. :12.000
## holiday weekday workingday weathersit
## Min. :0.00000 Min. :0.000 Not_Workingday:116 Weather_1:237
## 1st Qu.:0.00000 1st Qu.:1.000 Workingday :250 Weather_2:123
## Median :0.00000 Median :3.000 Weather_3: 6
## Mean :0.03005 Mean :2.986 Weather_4: 0
## 3rd Qu.:0.00000 3rd Qu.:5.000
## Max. :1.00000 Max. :6.000
## temp atemp hum windspeed
## Min. :0.1075 Min. :0.1017 Min. :0.2542 Min. :0.04665
## 1st Qu.:0.3477 1st Qu.:0.3507 1st Qu.:0.5081 1st Qu.:0.13372
## Median :0.5142 Median :0.4978 Median :0.6119 Median :0.17475
## Mean :0.5041 Mean :0.4819 Mean :0.6122 Mean :0.18957
## 3rd Qu.:0.6540 3rd Qu.:0.6076 3rd Qu.:0.7111 3rd Qu.:0.23120
## Max. :0.8617 Max. :0.8049 Max. :0.9250 Max. :0.44156
## casual registered cnt Type
## Min. : 2.0 Min. : 20 Min. : 22 Length:366
## 1st Qu.: 429.8 1st Qu.:3730 1st Qu.:4369 Class :character
## Median : 904.5 Median :4776 Median :5927 Mode :character
## Mean :1018.5 Mean :4581 Mean :5600
## 3rd Qu.:1262.0 3rd Qu.:5663 3rd Qu.:7011
## Max. :3410.0 Max. :6946 Max. :8714
We are fitting full model which will be used to stepwise regression later. We address multicollinearity issue and delete variables that might cause it.
full_model1 <- lm(registered ~ as.factor(workingday) + as.factor(season)+ holiday + weekday + temp + hum + windspeed + as.factor(weathersit),
data = training_data)
full_model2 <- lm( sqrt(casual)~ as.factor(workingday) + as.factor(season) + holiday + weekday + temp + hum + windspeed + as.factor(weathersit),
data = training_data)
full_model3 <- lm( cnt ~ casual +as.factor(workingday) + as.factor(season) + holiday + weekday + temp + hum + windspeed + as.factor(weathersit),
data = training_data)
summary(full_model1)
##
## Call:
## lm(formula = registered ~ as.factor(workingday) + as.factor(season) +
## holiday + weekday + temp + hum + windspeed + as.factor(weathersit),
## data = training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1831.18 -249.18 45.54 306.01 1029.65
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 939.50 182.22 5.156 4.21e-07 ***
## as.factor(workingday)Workingday 698.55 55.36 12.617 < 2e-16 ***
## as.factor(season)Spring 812.16 92.94 8.738 < 2e-16 ***
## as.factor(season)Summer 794.67 121.99 6.514 2.52e-10 ***
## as.factor(season)Fall 1306.52 82.32 15.872 < 2e-16 ***
## holiday -196.65 156.92 -1.253 0.210974
## weekday 17.57 12.46 1.410 0.159397
## temp 2757.41 236.16 11.676 < 2e-16 ***
## hum -610.45 230.43 -2.649 0.008431 **
## windspeed -1469.23 351.94 -4.175 3.76e-05 ***
## as.factor(weathersit)Weather_2 -224.29 66.00 -3.398 0.000756 ***
## as.factor(weathersit)Weather_3 -1377.56 146.15 -9.426 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 471.3 on 353 degrees of freedom
## Multiple R-squared: 0.8083, Adjusted R-squared: 0.8024
## F-statistic: 135.3 on 11 and 353 DF, p-value: < 2.2e-16
summary(full_model2)
##
## Call:
## lm(formula = sqrt(casual) ~ as.factor(workingday) + as.factor(season) +
## holiday + weekday + temp + hum + windspeed + as.factor(weathersit),
## data = training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.0473 -3.0247 -0.0296 2.7154 18.3344
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.89493 1.84453 10.786 < 2e-16 ***
## as.factor(workingday)Workingday -11.35502 0.56044 -20.261 < 2e-16 ***
## as.factor(season)Spring 6.26683 0.94083 6.661 1.05e-10 ***
## as.factor(season)Summer 3.36323 1.23484 2.724 0.00678 **
## as.factor(season)Fall 4.43365 0.83329 5.321 1.84e-07 ***
## holiday -2.56032 1.58848 -1.612 0.10790
## weekday 0.09252 0.12616 0.733 0.46381
## temp 32.15458 2.39057 13.451 < 2e-16 ***
## hum -6.06923 2.33258 -2.602 0.00966 **
## windspeed -14.22007 3.56256 -3.992 7.99e-05 ***
## as.factor(weathersit)Weather_2 -2.09773 0.66811 -3.140 0.00183 **
## as.factor(weathersit)Weather_3 -8.83239 1.47944 -5.970 5.78e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.771 on 353 degrees of freedom
## Multiple R-squared: 0.8019, Adjusted R-squared: 0.7957
## F-statistic: 129.9 on 11 and 353 DF, p-value: < 2.2e-16
summary(full_model3)
##
## Call:
## lm(formula = cnt ~ casual + as.factor(workingday) + as.factor(season) +
## holiday + weekday + temp + hum + windspeed + as.factor(weathersit),
## data = training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2086.82 -250.88 43.02 316.95 994.65
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.822e+02 1.847e+02 3.694 0.000255 ***
## casual 1.393e+00 8.172e-02 17.048 < 2e-16 ***
## as.factor(workingday)Workingday 9.622e+02 7.673e+01 12.540 < 2e-16 ***
## as.factor(season)Spring 7.127e+02 9.250e+01 7.705 1.35e-13 ***
## as.factor(season)Summer 7.396e+02 1.189e+02 6.221 1.40e-09 ***
## as.factor(season)Fall 1.250e+03 8.071e+01 15.488 < 2e-16 ***
## holiday -1.457e+02 1.526e+02 -0.955 0.340318
## weekday 1.545e+01 1.210e+01 1.277 0.202431
## temp 2.167e+03 2.599e+02 8.338 1.72e-15 ***
## hum -4.957e+02 2.248e+02 -2.205 0.028098 *
## windspeed -1.132e+03 3.485e+02 -3.248 0.001273 **
## as.factor(weathersit)Weather_2 -1.881e+02 6.446e+01 -2.918 0.003754 **
## as.factor(weathersit)Weather_3 -1.256e+03 1.440e+02 -8.721 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 457.2 on 352 degrees of freedom
## Multiple R-squared: 0.8937, Adjusted R-squared: 0.8901
## F-statistic: 246.6 on 12 and 352 DF, p-value: < 2.2e-16
#plot added variables
avPlots(full_model1)
avPlots(full_model2)
avPlots(full_model3)
stepwise_model1 <- step(full_model1, direction = "both")
## Start: AIC=4505.31
## registered ~ as.factor(workingday) + as.factor(season) + holiday +
## weekday + temp + hum + windspeed + as.factor(weathersit)
##
## Df Sum of Sq RSS AIC
## - holiday 1 348826 78757811 4504.9
## <none> 78408985 4505.3
## - weekday 1 441649 78850634 4505.4
## - hum 1 1558913 79967898 4510.5
## - windspeed 1 3871169 82280153 4520.9
## - as.factor(weathersit) 2 19753550 98162534 4583.3
## - temp 1 30282108 108691093 4622.5
## - as.factor(workingday) 1 35361497 113770481 4639.2
## - as.factor(season) 3 60396050 138805035 4707.8
##
## Step: AIC=4504.93
## registered ~ as.factor(workingday) + as.factor(season) + weekday +
## temp + hum + windspeed + as.factor(weathersit)
##
## Df Sum of Sq RSS AIC
## <none> 78757811 4504.9
## - weekday 1 509480 79267291 4505.3
## + holiday 1 348826 78408985 4505.3
## - hum 1 1478325 80236136 4509.7
## - windspeed 1 3852838 82610649 4520.4
## - as.factor(weathersit) 2 19849983 98607794 4583.0
## - temp 1 30118568 108876379 4621.1
## - as.factor(workingday) 1 39599907 118357718 4651.6
## - as.factor(season) 3 60285076 139042887 4706.4
stepwise_model2 <- step(full_model2, direction = "both")
## Start: AIC=1152.44
## sqrt(casual) ~ as.factor(workingday) + as.factor(season) + holiday +
## weekday + temp + hum + windspeed + as.factor(weathersit)
##
## Df Sum of Sq RSS AIC
## - weekday 1 12.2 8046.8 1151.0
## <none> 8034.6 1152.4
## - holiday 1 59.1 8093.7 1153.1
## - hum 1 154.1 8188.7 1157.4
## - windspeed 1 362.6 8397.2 1166.5
## - as.factor(weathersit) 2 829.0 8863.6 1184.3
## - as.factor(season) 3 1441.3 9475.9 1206.7
## - temp 1 4117.9 12152.4 1301.5
## - as.factor(workingday) 1 9343.4 17378.0 1432.0
##
## Step: AIC=1151
## sqrt(casual) ~ as.factor(workingday) + as.factor(season) + holiday +
## temp + hum + windspeed + as.factor(weathersit)
##
## Df Sum of Sq RSS AIC
## <none> 8046.8 1151.0
## - holiday 1 63.9 8110.7 1151.9
## + weekday 1 12.2 8034.6 1152.4
## - hum 1 164.6 8211.5 1156.4
## - windspeed 1 359.7 8406.5 1165.0
## - as.factor(weathersit) 2 818.8 8865.6 1182.4
## - as.factor(season) 3 1445.3 9492.1 1205.3
## - temp 1 4107.9 12154.7 1299.5
## - as.factor(workingday) 1 9348.6 17395.5 1430.4
stepwise_model3 <- step(full_model3, direction = "both")
## Start: AIC=4484.06
## cnt ~ casual + as.factor(workingday) + as.factor(season) + holiday +
## weekday + temp + hum + windspeed + as.factor(weathersit)
##
## Df Sum of Sq RSS AIC
## - holiday 1 190554 73761582 4483.0
## - weekday 1 340857 73911885 4483.7
## <none> 73571028 4484.1
## - hum 1 1016235 74587264 4487.1
## - windspeed 1 2205158 75776187 4492.8
## - temp 1 14531329 88102358 4547.9
## - as.factor(weathersit) 2 15979331 89550360 4551.8
## - as.factor(workingday) 1 32868919 106439948 4616.9
## - as.factor(season) 3 54301778 127872806 4679.8
## - casual 1 60747394 134318422 4701.8
##
## Step: AIC=4483
## cnt ~ casual + as.factor(workingday) + as.factor(season) + weekday +
## temp + hum + windspeed + as.factor(weathersit)
##
## Df Sum of Sq RSS AIC
## - weekday 1 383340 74144922 4482.9
## <none> 73761582 4483.0
## + holiday 1 190554 73571028 4484.1
## - hum 1 963292 74724874 4485.7
## - windspeed 1 2178046 75939628 4491.6
## - temp 1 14386687 88148269 4546.0
## - as.factor(weathersit) 2 15987090 89748672 4550.6
## - as.factor(workingday) 1 35790437 109552019 4625.4
## - as.factor(season) 3 54174909 127936491 4678.0
## - casual 1 61516967 135278549 4702.4
##
## Step: AIC=4482.9
## cnt ~ casual + as.factor(workingday) + as.factor(season) + temp +
## hum + windspeed + as.factor(weathersit)
##
## Df Sum of Sq RSS AIC
## <none> 74144922 4482.9
## + weekday 1 383340 73761582 4483.0
## + holiday 1 233037 73911885 4483.7
## - hum 1 1087039 75231961 4486.2
## - windspeed 1 2121892 76266814 4491.2
## - temp 1 14190777 88335699 4544.8
## - as.factor(weathersit) 2 15737193 89882115 4549.2
## - as.factor(workingday) 1 36139539 110284461 4625.8
## - as.factor(season) 3 54380050 128524972 4677.7
## - casual 1 62035815 136180737 4702.8
summary(stepwise_model1)
##
## Call:
## lm(formula = registered ~ as.factor(workingday) + as.factor(season) +
## weekday + temp + hum + windspeed + as.factor(weathersit),
## data = training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1834.15 -238.70 46.37 304.93 1029.11
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 911.40 180.98 5.036 7.59e-07 ***
## as.factor(workingday)Workingday 715.86 53.66 13.341 < 2e-16 ***
## as.factor(season)Spring 815.27 92.98 8.768 < 2e-16 ***
## as.factor(season)Summer 798.73 122.04 6.545 2.09e-10 ***
## as.factor(season)Fall 1305.98 82.38 15.853 < 2e-16 ***
## weekday 18.82 12.43 1.513 0.13110
## temp 2748.78 236.25 11.635 < 2e-16 ***
## hum -593.43 230.21 -2.578 0.01035 *
## windspeed -1465.70 352.21 -4.161 3.97e-05 ***
## as.factor(weathersit)Weather_2 -229.81 65.91 -3.487 0.00055 ***
## as.factor(weathersit)Weather_3 -1381.03 146.24 -9.444 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 471.7 on 354 degrees of freedom
## Multiple R-squared: 0.8075, Adjusted R-squared: 0.802
## F-statistic: 148.5 on 10 and 354 DF, p-value: < 2.2e-16
summary(stepwise_model2)
##
## Call:
## lm(formula = sqrt(casual) ~ as.factor(workingday) + as.factor(season) +
## holiday + temp + hum + windspeed + as.factor(weathersit),
## data = training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.7811 -2.9288 -0.0188 2.7108 18.2147
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 20.2681 1.7718 11.439 < 2e-16 ***
## as.factor(workingday)Workingday -11.3579 0.5601 -20.280 < 2e-16 ***
## as.factor(season)Spring 6.2838 0.9399 6.685 8.99e-11 ***
## as.factor(season)Summer 3.3963 1.2332 2.754 0.00619 **
## as.factor(season)Fall 4.4543 0.8323 5.352 1.57e-07 ***
## holiday -2.6529 1.5824 -1.677 0.09452 .
## temp 32.1003 2.3879 13.443 < 2e-16 ***
## hum -6.2416 2.3192 -2.691 0.00746 **
## windspeed -14.1586 3.5593 -3.978 8.43e-05 ***
## as.factor(weathersit)Weather_2 -2.0546 0.6651 -3.089 0.00217 **
## as.factor(weathersit)Weather_3 -8.7637 1.4755 -5.939 6.84e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.768 on 354 degrees of freedom
## Multiple R-squared: 0.8016, Adjusted R-squared: 0.796
## F-statistic: 143 on 10 and 354 DF, p-value: < 2.2e-16
summary(stepwise_model3)
##
## Call:
## lm(formula = cnt ~ casual + as.factor(workingday) + as.factor(season) +
## temp + hum + windspeed + as.factor(weathersit), data = training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2060.85 -257.74 44.82 332.76 1011.50
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.189e+02 1.775e+02 4.050 6.29e-05 ***
## casual 1.403e+00 8.153e-02 17.210 < 2e-16 ***
## as.factor(workingday)Workingday 9.826e+02 7.480e+01 13.136 < 2e-16 ***
## as.factor(season)Spring 7.157e+02 9.258e+01 7.730 1.12e-13 ***
## as.factor(season)Summer 7.474e+02 1.189e+02 6.285 9.65e-10 ***
## as.factor(season)Fall 1.252e+03 8.076e+01 15.501 < 2e-16 ***
## temp 2.135e+03 2.594e+02 8.231 3.60e-15 ***
## hum -5.094e+02 2.236e+02 -2.278 0.02331 *
## windspeed -1.110e+03 3.486e+02 -3.183 0.00159 **
## as.factor(weathersit)Weather_2 -1.840e+02 6.418e+01 -2.867 0.00439 **
## as.factor(weathersit)Weather_3 -1.243e+03 1.438e+02 -8.645 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 457.7 on 354 degrees of freedom
## Multiple R-squared: 0.8928, Adjusted R-squared: 0.8898
## F-statistic: 295 on 10 and 354 DF, p-value: < 2.2e-16
avPlots(stepwise_model1)
avPlots(stepwise_model2)
avPlots(stepwise_model3)
For vif, values close to 1 indicate no multicollinearity, values between 1-3 indicate mild collinearity which is not ideal but can be considered ok since it won’t severaly impact the interpretability of the coefficients, values above indicate severe multicollinearity which is considered unreliable.
vif(stepwise_model1)
## GVIF Df GVIF^(1/(2*Df))
## as.factor(workingday) 1.019319 1 1.009613
## as.factor(season) 3.639198 3 1.240226
## weekday 1.017995 1 1.008957
## temp 3.282494 1 1.811766
## hum 1.918458 1 1.385084
## windspeed 1.199915 1 1.095406
## as.factor(weathersit) 1.834088 2 1.163737
vif(stepwise_model2)
## GVIF Df GVIF^(1/(2*Df))
## as.factor(workingday) 1.086910 1 1.042550
## as.factor(season) 3.640064 3 1.240276
## holiday 1.071406 1 1.035087
## temp 3.282132 1 1.811666
## hum 1.905606 1 1.380437
## windspeed 1.199328 1 1.095138
## as.factor(weathersit) 1.826326 2 1.162504
vif(stepwise_model3)
## GVIF Df GVIF^(1/(2*Df))
## casual 3.575021 1 1.890773
## as.factor(workingday) 2.104307 1 1.450623
## as.factor(season) 3.879645 3 1.253522
## temp 4.203539 1 2.050253
## hum 1.922350 1 1.386488
## windspeed 1.248688 1 1.117447
## as.factor(weathersit) 1.881977 2 1.171261
validation_data$predicted_registered <- predict(stepwise_model1, newdata = validation_data)
validation_data$predicted_sqrtcasual <- predict(stepwise_model2, newdata = validation_data)
validation_data$predicted_count <- predict(stepwise_model3, newdata = validation_data)
observed_values1 <- validation_data$registered
predicted_values1 <- validation_data$predicted_registered
# Calculate different prediction performance metrics
# Functions from the MLmetrics package
# Common regression metrics
# Calculate the Root Mean Squared Error (RMSE), which measures the ...
# ... average magnitude of prediction errors.
# Lower is better.
rmse1 <- RMSE(predicted_values1, observed_values1)
# same as
# sqrt(mean((observed_values - predicted_values)^2))
# Compute the Mean Absolute Error (MAE), indicating the average absolute ...
# ... difference between predicted and observed values.
# Lower is better.
mae1 <- MAE(predicted_values1, observed_values1)
# same as
# mean(abs(observed_values - predicted_values))
# Calculate the Mean Absolute Percentage Error (MAPE), measuring the ...
# ... average percentage difference between predicted and observed values.
mape1 <- MAPE(predicted_values1, observed_values1)
# same as
# mean(abs(predicted_values-observed_values)/observed_values)
# Determine the R-squared (R²) Score, representing the proportion of the ...
# ... variance in the observed values (of validation data set) ...
# ... explained by the predicted values from the model.
# Higher is better.
r_squared1 <- R2_Score(predicted_values1, observed_values1)
# same as
# summary(lm(observed_values ~ predicted_values))$r.squared
# Display the calculated metrics
cat("Root Mean Squared Error (RMSE):", round(rmse1, digits = 4), "\n")
## Root Mean Squared Error (RMSE): 1929.403
cat("Mean Absolute Error (MAE):", round(mae1, digits = 4), "\n")
## Mean Absolute Error (MAE): 1799.901
cat("R-squared (R^2) Score:", round(r_squared1, digits = 4), "\n")
## R-squared (R^2) Score: -0.84
cat("Mean Absolute Percentage Error (MPE):", round(mape1, digits = 4), "\n")
## Mean Absolute Percentage Error (MPE): 0.6342
#second question
observed_values2 <- sqrt(validation_data$casual) #should I transform it here??
predicted_values2 <- validation_data$predicted_sqrtcasual
rmse2 <- RMSE(predicted_values2, observed_values2)
mae2 <- MAE(predicted_values2, observed_values2)
mape2 <- MAPE(predicted_values2, observed_values2)
r_squared2 <- R2_Score(predicted_values2, observed_values2)
cat("Root Mean Squared Error (RMSE):", round(rmse2, digits = 4), "\n")
## Root Mean Squared Error (RMSE): 7.5939
cat("Mean Absolute Error (MAE):", round(mae2, digits = 4), "\n")
## Mean Absolute Error (MAE): 5.9561
cat("R-squared (R^2) Score:", round(r_squared2, digits = 4), "\n")
## R-squared (R^2) Score: 0.5865
cat("Mean Absolute Percentage Error (MPE):", round(mape2, digits = 4), "\n")
## Mean Absolute Percentage Error (MPE): 0.2126
#third question
observed_values3 <- validation_data$count
predicted_values3 <- validation_data$predicted_count
rmse3 <- RMSE(predicted_values3, observed_values3)
mae3 <- MAE(predicted_values3, observed_values3)
mape3 <- MAPE(predicted_values3, observed_values3)
r_squared3 <- R2_Score(predicted_values3, observed_values3)
## Warning in mean.default(y_true): argument is not numeric or logical: returning
## NA
cat("Root Mean Squared Error (RMSE):", round(rmse3, digits = 4), "\n")
## Root Mean Squared Error (RMSE): NaN
cat("Mean Absolute Error (MAE):", round(mae3, digits = 4), "\n")
## Mean Absolute Error (MAE): NaN
cat("R-squared (R^2) Score:", round(r_squared3, digits = 4), "\n")
## R-squared (R^2) Score: NaN
cat("Mean Absolute Percentage Error (MPE):", round(mape3, digits = 4), "\n")
## Mean Absolute Percentage Error (MPE): NaN
we choose model 1 because it has best RMSE and R^2 (it is done for whole 2011 year)
#done on model 1
m.mlr <- lm(registered ~ as.factor(workingday) + as.factor(season) + holiday +
temp + hum + windspeed + as.factor(weathersit),
data = year2011)
diagnostics_df <- data.frame(
Residuals = resid(m.mlr),
Fitted_Values = fitted(m.mlr),
Standardized_Residuals = rstandard(m.mlr),
Leverage = hatvalues(m.mlr)
#Date = year2011$dteday
)
# Create the standardized residuals vs. fitted values plot
ggplot(diagnostics_df, aes(x = Fitted_Values, y = Residuals)) +
geom_point(col="blue", alpha=0.75) +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
labs( title = "Residuals vs. Fitted Values",
x = "Fitted Values", y = "Residuals") +
theme_bw()
# Create the QQ plot
ggplot(diagnostics_df, aes(sample = Standardized_Residuals)) +
stat_qq(aes(sample = Standardized_Residuals), distribution = qnorm,
size = 2, col="blue", alpha = 0.75) +
geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "red") +
labs(title = "Standardized Residual QQ Plot",
x = "Theoretical Quantiles", y = "Sample Quantiles") +
theme_bw()
# Create the sqrt(|standardized residuals|) vs. fitted values plot
ggplot(diagnostics_df, aes(x = Fitted_Values, y = sqrt(abs(Standardized_Residuals)))) +
geom_point(col="blue", alpha=0.75) +
labs( title = "Residuals vs. Fitted Values",
x = "Fitted Values", y = "sqrt(|Standardized Residuals|)") +
theme_bw()
# Leverage vs Standardized Residuals
ggplot(diagnostics_df, aes(x = Leverage, y = Standardized_Residuals)) +
geom_point(alpha = 0.75) +
labs(title = "Standardized Residuals vs. Leverage Plot",
x = "Leverage", y = "Standardized Residuals") +
theme_bw()
# Prediction
validation2012<- subset(validation_data)
validation2012$predicted_registered <- predict(m.mlr, newdata = validation2012)
summary(validation2012$predicted_registered)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -631.1 2260.8 3073.1 2829.8 3535.6 4281.9
# Extract observed and predicted values
observed_values <- validation2012$registered
predicted_values <- validation2012$predicted_registered
rmse <- RMSE(predicted_values, observed_values)
mae <- MAE(predicted_values, observed_values)
mape <- MAPE(predicted_values, observed_values)
r_squared <- R2_Score(predicted_values, observed_values)
cat("Root Mean Squared Error (RMSE):", round(rmse, digits = 4), "\n")
## Root Mean Squared Error (RMSE): 1930.15
cat("Mean Absolute Error (MAE):", round(mae, digits = 4), "\n")
## Mean Absolute Error (MAE): 1797.757
cat("R-squared (R^2) Score:", round(r_squared, digits = 4), "\n")
## R-squared (R^2) Score: -0.8414
cat("Mean Absolute Percentage Error (MPE):", round(mape, digits = 4), "\n")
## Mean Absolute Percentage Error (MPE): 0.6379
ggplot(validation2012, aes(x = registered, y = predicted_registered)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
labs(x = "Observed Values", y = "Predicted Values",
title = "Observed vs. Predicted Values") +
theme_bw()
# Residuals plot
ggplot(validation2012, aes(x = 1:nrow(validation2012), y = registered-predicted_registered)) +
geom_point() +
geom_abline(intercept = 0, slope = 0, color = "red", linetype = "dashed") +
labs(x = "Observation Index", y = "Residuals",
title = "Observed vs. Predicted Values") +
theme_bw()